Let's start with a GAN that uses only linear (fully connected) layers, to get a feel for the basic network structure and training procedure of a GAN.
The network consists of two parts: the generator network (Generator) and the discriminator network (Discriminator).
We use LeakyReLU as the activation function throughout, except in the last layers of G and D; in addition, we insert BatchNormalization between layers.
In [ ]:
import torch
torch.cuda.set_device(2)
In [ ]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
class Generator(nn.Module):
def __init__(self, image_size=32, latent_dim=100, output_channel=1):
"""
image_size: image width and height
latent_dim: the dimension of the random noise z
output_channel: the channel of generated image, for example, 1 for gray image, 3 for RGB image
"""
super(Generator, self).__init__()
self.latent_dim = latent_dim
self.output_channel = output_channel
self.image_size = image_size
# Linear layer: latent_dim -> 128 -> 256 -> 512 -> 1024 -> output_channel * image_size * image_size -> Tanh
self.model = nn.Sequential(
nn.Linear(latent_dim, 128),
nn.BatchNorm1d(128),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(128, 256),
nn.BatchNorm1d(256),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(256, 512),
nn.BatchNorm1d(512),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(512, 1024),
nn.BatchNorm1d(1024),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(1024, output_channel * image_size * image_size),
nn.Tanh()
)
def forward(self, z):
img = self.model(z)
img = img.view(img.size(0), self.output_channel, self.image_size, self.image_size)
return img
class Discriminator(nn.Module):
def __init__(self, image_size=32, input_channel=1):
"""
image_size: image width and height
input_channel: the channel of input image, for example, 1 for gray image, 3 for RGB image
"""
super(Discriminator, self).__init__()
self.image_size = image_size
self.input_channel = input_channel
# Linear layer: input_channel * image_size * image_size -> 1024 -> 512 -> 256 -> 1 -> Sigmoid
self.model = nn.Sequential(
nn.Linear(input_channel * image_size * image_size, 1024),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(1024, 512),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(512, 256),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(256, 1),
nn.Sigmoid(),
)
def forward(self, img):
img_flat = img.view(img.size(0), -1)
out = self.model(img_flat)
return out
Before training our GAN, let's first introduce the datasets used in this exercise; we provide two datasets for you to experiment with.
Below are the two data-loading functions. Note that all images are normalized to [-1, 1].
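As a small aside (not in the original notebook), the cell below illustrates that Normalize(mean=0.5, std=0.5) maps values in [0, 1] to [-1, 1] via $x \mapsto (x - 0.5)/0.5$, and that the denorm function defined further below, $(x + 1)/2$, inverts this mapping; the toy tensor is used only for illustration.
In [ ]:
import torch
import torchvision.transforms as transforms

# a toy single-channel "image" whose pixel values span [0, 1]
toy = torch.tensor([[[0.0, 0.25, 0.5, 0.75, 1.0]]])
normalize = transforms.Normalize(mean=(0.5,), std=(0.5,))
normalized = normalize(toy)
print(normalized)            # values now span [-1, 1]
print((normalized + 1) / 2)  # denorm recovers the original values in [0, 1]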
In [ ]:
def load_mnist_data():
"""
load mnist(0,1,2) dataset
"""
transform = torchvision.transforms.Compose([
# convert to a 1-channel grayscale image since images are read in RGB mode
transforms.Grayscale(1),
# resize image from 28 * 28 to 32 * 32
transforms.Resize(32),
transforms.ToTensor(),
# normalize with mean=0.5 std=0.5
transforms.Normalize(mean=(0.5, ),
std=(0.5, ))
])
train_dataset = torchvision.datasets.ImageFolder(root='./data/mnist', transform=transform)
return train_dataset
def load_furniture_data():
"""
load furniture dataset
"""
transform = torchvision.transforms.Compose([
transforms.ToTensor(),
# normalize with mean=0.5 std=0.5
transforms.Normalize(mean=(0.5, 0.5, 0.5),
std=(0.5, 0.5, 0.5))
])
train_dataset = torchvision.datasets.ImageFolder(root='./data/household_furniture', transform=transform)
return train_dataset
(No need to read this in detail.) Run the following two cells to view 20 random real images from each of the two datasets.
In [ ]:
def denorm(x):
# denormalize
out = (x + 1) / 2
return out.clamp(0, 1)
In [ ]:
from utils import show
"""
you can skip reading the code in this cell
"""
# show mnist real data
train_dataset = load_mnist_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=20, shuffle=True)
show(torchvision.utils.make_grid(denorm(next(iter(trainloader))[0]), nrow=5))
# show furniture real data
train_dataset = load_furniture_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=20, shuffle=True)
show(torchvision.utils.make_grid(denorm(next(iter(trainloader))[0]), nrow=5))
The following code implements one epoch of GAN training.
Roughly speaking, GAN training proceeds in two steps: first, feed random noise z to G to generate images, then feed both real images and G's generated images to D and back-propagate the corresponding loss to update D; next, generate images with G again, feed them to D, and back-propagate the corresponding loss to update G.
The optimization objective of the vanilla GAN for D and G is the minimax game
$$\min_G \max_D \; \mathbb{E}_{x\sim p_{data}(x)}[\log D(x)] + \mathbb{E}_{z\sim p(z)}[\log(1 - D(G(z)))].$$
For G, this is likewise implemented by minimizing a BCE loss: we simply label the images generated from $z\sim{P(z)}$ as 1, and one can check that this loss is consistent with G's optimization objective.
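As a quick sanity check (not in the original notebook), writing out the BCE loss on a single generated sample whose label is set to 1 gives
$$\mathrm{BCE}(D(G(z)),\,1) = -\log D(G(z)),$$
so minimizing it pushes $D(G(z))$ toward 1, i.e. it trains G to fool D; this is the usual non-saturating form of the generator objective rather than a literal minimization of $\log(1 - D(G(z)))$.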
In [ ]:
def train(trainloader, G, D, G_optimizer, D_optimizer, loss_func, device, z_dim):
"""
train a GAN with model G and D in one epoch
Args:
trainloader: data loader to train
G: model Generator
D: model Discriminator
G_optimizer: optimizer for G (e.g. Adam, SGD)
D_optimizer: optimizer for D (e.g. Adam, SGD)
loss_func: loss function to train G and D. For example, Binary Cross Entropy(BCE) loss function
device: cpu or cuda device
z_dim: the dimension of random noise z
"""
# set train mode
D.train()
G.train()
D_total_loss = 0
G_total_loss = 0
for i, (x, _) in enumerate(trainloader):
# real label and fake label
y_real = torch.ones(x.size(0), 1).to(device)
y_fake = torch.zeros(x.size(0), 1).to(device)
x = x.to(device)
z = torch.rand(x.size(0), z_dim).to(device)
# update D network
# D optimizer zero grads
D_optimizer.zero_grad()
# D real loss from real images
d_real = D(x)
d_real_loss = loss_func(d_real, y_real)
# D fake loss from fake images generated by G
g_z = G(z)
d_fake = D(g_z)
d_fake_loss = loss_func(d_fake, y_fake)
# D backward and step
d_loss = d_real_loss + d_fake_loss
d_loss.backward()
D_optimizer.step()
# update G network
# G optimizer zero grads
G_optimizer.zero_grad()
# G loss
g_z = G(z)
d_fake = D(g_z)
g_loss = loss_func(d_fake, y_real)
# G backward and step
g_loss.backward()
G_optimizer.step()
D_total_loss += d_loss.item()
G_total_loss += g_loss.item()
return D_total_loss / len(trainloader), G_total_loss / len(trainloader)
Once the model has been trained, we want to inspect the images G generates; the visualize_results function below does exactly that. Note that the generated images lie in [-1, 1], so we first denormalize (denorm) them back to [0, 1].
In [ ]:
def visualize_results(G, device, z_dim, result_size=20):
G.eval()
z = torch.rand(result_size, z_dim).to(device)
g_z = G(z)
show(torchvision.utils.make_grid(denorm(g_z.detach().cpu()), nrow=5))
Everything is now in place, so let's try training a basic GAN. The run_gan function below calls train and visualize_results to train our GAN.
In [ ]:
def run_gan(trainloader, G, D, G_optimizer, D_optimizer, loss_func, n_epochs, device, latent_dim):
d_loss_hist = []
g_loss_hist = []
for epoch in range(n_epochs):
d_loss, g_loss = train(trainloader, G, D, G_optimizer, D_optimizer, loss_func, device,
z_dim=latent_dim)
print('Epoch {}: Train D loss: {:.4f}, G loss: {:.4f}'.format(epoch, d_loss, g_loss))
d_loss_hist.append(d_loss)
g_loss_hist.append(g_loss)
if epoch == 0 or (epoch + 1) % 10 == 0:
visualize_results(G, device, latent_dim)
return d_loss_hist, g_loss_hist
Once the hyperparameters are set we can start training! Let's try it on the small MNIST subset (digits 0, 1, 2).
In [ ]:
# hyper params
# z dim
latent_dim = 100
# image size and channel
image_size=32
image_channel=1
# Adam lr and betas
learning_rate = 0.0002
betas = (0.5, 0.999)
# epochs and batch size
n_epochs = 100
batch_size = 32
# device : cpu or cuda:0/1/2/3
device = torch.device('cuda:2')
# mnist dataset and dataloader
train_dataset = load_mnist_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# use BCELoss as loss function
bceloss = nn.BCELoss().to(device)
# G and D model
G = Generator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = Discriminator(image_size=image_size, input_channel=image_channel).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
In [ ]:
d_loss_hist, g_loss_hist = run_gan(trainloader, G, D, G_optimizer, D_optimizer, bceloss,
n_epochs, device, latent_dim)
After training, let's look at the images G generates. Even a simple GAN produces decent results on such a simple dataset, although there are still plenty of flaws; for example, the generated digits contain a lot of odd speckle noise.
Let's look at the loss curves of G and D (run the cell below).
In [ ]:
from utils import loss_plot
In [ ]:
loss_plot(d_loss_hist, g_loss_hist)
Answer:
A CNN's loss curve usually drops quickly at first; after many iterations the decrease slows down, the curve may oscillate, and the loss may even rise. This is because at the start the CNN is still far from an optimum, so training makes rapid progress; after many iterations the CNN is close to fitting the data, and may even overfit, which causes the loss to oscillate.
The loss curves of the generator and the discriminator both differ clearly from a CNN's: the generator's loss gradually increases while the discriminator's loss gradually decreases. At the very beginning the generator's loss rises sharply, because at first the generator is barely influenced by the real images. As the iterations accumulate, the discriminator gets better at distinguishing real from fake images, so the generator's loss rises quickly while the discriminator's loss falls.
After many iterations the two networks genuinely start to compete: the generator's loss may occasionally drop and the discriminator's loss may occasionally rise, because the generator gets better at producing convincing fakes while the discriminator also gets better at telling real from fake.
Over the long run the generator's loss trends upward while the discriminator's trends downward. Since the dataset is finite, the networks ultimately tend to fit it; the discriminator curve falling while the generator curve rises indicates that the discriminator's ability improves faster than the generator's, or that the generator has already converged.
The biggest change in DCGAN (Deep Convolutional GAN) is replacing the fully connected layers with CNNs. In the generator G, transposed convolutions with stride 2 are used to generate the image while enlarging its spatial size; in the discriminator D, convolutions with stride 2 are used to downsample the image. In addition, DCGAN inserts BatchNormalization between layers (which we already did in the plain GAN), uses ReLU as the activation in G, and LeakyReLU in D.
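The following small check (not in the original notebook) illustrates the shape arithmetic that DCGAN relies on: a ConvTranspose2d with kernel 4, stride 2, padding 1 doubles the spatial size, while a Conv2d with kernel 3, stride 2, padding 1 halves it; the channel counts and the 8 x 8 input are chosen only for illustration.
In [ ]:
import torch
import torch.nn as nn

# a fake feature map: batch of 2, 128 channels, 8 x 8 spatial size
x = torch.randn(2, 128, 8, 8)

# stride-2 transposed convolution (kernel 4, padding 1): 8 -> 16
up = nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1)
print(up(x).shape)    # torch.Size([2, 64, 16, 16])

# stride-2 convolution (kernel 3, padding 1): 8 -> 4
down = nn.Conv2d(128, 64, kernel_size=3, stride=2, padding=1)
print(down(x).shape)  # torch.Size([2, 64, 4, 4])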
In [ ]:
from utils import initialize_weights
class DCGenerator(nn.Module):
def __init__(self, image_size=32, latent_dim=64, output_channel=1):
super(DCGenerator, self).__init__()
self.image_size = image_size
self.latent_dim = latent_dim
self.output_channel = output_channel
self.init_size = image_size // 8
# fc: Linear -> BN -> ReLU
self.fc = nn.Sequential(
nn.Linear(latent_dim, 512 * self.init_size ** 2),
nn.BatchNorm1d(512 * self.init_size ** 2),
nn.ReLU(inplace=True)
)
# deconv: ConvTranspose2d(4, 2, 1) -> BN -> ReLU ->
# ConvTranspose2d(4, 2, 1) -> BN -> ReLU ->
# ConvTranspose2d(4, 2, 1) -> Tanh
self.deconv = nn.Sequential(
nn.ConvTranspose2d(512, 256, 4, stride=2, padding=1),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(128, output_channel, 4, stride=2, padding=1),
nn.Tanh(),
)
initialize_weights(self)
def forward(self, z):
out = self.fc(z)
out = out.view(out.shape[0], 512, self.init_size, self.init_size)
img = self.deconv(out)
return img
class DCDiscriminator(nn.Module):
def __init__(self, image_size=32, input_channel=1, sigmoid=True):
super(DCDiscriminator, self).__init__()
self.image_size = image_size
self.input_channel = input_channel
self.fc_size = image_size // 8
# conv: Conv2d(3,2,1) -> LeakyReLU
# Conv2d(3,2,1) -> BN -> LeakyReLU
# Conv2d(3,2,1) -> BN -> LeakyReLU
self.conv = nn.Sequential(
nn.Conv2d(input_channel, 128, 3, 2, 1),
nn.LeakyReLU(0.2),
nn.Conv2d(128, 256, 3, 2, 1),
nn.BatchNorm2d(256),
nn.LeakyReLU(0.2),
nn.Conv2d(256, 512, 3, 2, 1),
nn.BatchNorm2d(512),
nn.LeakyReLU(0.2),
)
# fc: Linear -> Sigmoid
self.fc = nn.Sequential(
nn.Linear(512 * self.fc_size * self.fc_size, 1),
)
if sigmoid:
self.fc.add_module('sigmoid', nn.Sigmoid())
initialize_weights(self)
def forward(self, img):
out = self.conv(img)
out = out.view(out.shape[0], -1)
out = self.fc(out)
return out
In the same way, we train DCGAN on the same MNIST dataset.
In [ ]:
# hyper params
# z dim
latent_dim = 100
# image size and channel
image_size=32
image_channel=1
# Adam lr and betas
learning_rate = 0.0002
betas = (0.5, 0.999)
# epochs and batch size
n_epochs = 100
batch_size = 32
# device : cpu or cuda:0/1/2/3
device = torch.device('cuda:2')
# mnist dataset and dataloader
train_dataset = load_mnist_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# use BCELoss as loss function
bceloss = nn.BCELoss().to(device)
# G and D model, use DCGAN
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
In [ ]:
d_loss_hist, g_loss_hist = run_gan(trainloader, G, D, G_optimizer, D_optimizer, bceloss,
n_epochs, device, latent_dim)
In [ ]:
loss_plot(d_loss_hist, g_loss_hist)
The images generated by DCGAN are clearly better than those from the GAN with only linear layers. Next, let's train DCGAN on the furniture dataset.
In [ ]:
# RGB image channel = 3
image_channel=3
# epochs
n_epochs = 300
batch_size = 32
image_size=32
latent_dim = 100
device = torch.device('cuda:2')
learning_rate = 0.0002
betas = (0.5, 0.999)
bceloss = nn.BCELoss().to(device)
# furniture dataset and dataloader
train_dataset = load_furniture_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# G and D model, use DCGAN
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
d_loss_hist, g_loss_hist = run_gan(trainloader, G, D, G_optimizer, D_optimizer, bceloss,
n_epochs, device, latent_dim)
In [ ]:
loss_plot(d_loss_hist, g_loss_hist)
In [ ]:
class L2Loss(nn.Module):
def __init__(self):
super(L2Loss, self).__init__()
def forward(self, input_, target):
"""
input_: (batch_size*1)
target: (batch_size*1) labels, 1 or 0
"""
return ((input_ - target) ** 2).mean()
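A minimal usage check for L2Loss (not in the original notebook); the toy predictions and labels are chosen only for illustration, and the expected value is the mean of the squared errors.
In [ ]:
# sanity-check L2Loss on a toy batch: mean of (0.8-1)^2 and (0.3-0)^2 is 0.065
l2 = L2Loss()
pred = torch.tensor([[0.8], [0.3]])
target = torch.tensor([[1.0], [0.0]])
print(l2(pred, target))  # tensor(0.0650)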
After completing the code above, train DCGAN on the MNIST dataset using your L2Loss.
In [ ]:
# hyper params
# z dim
latent_dim = 100
# image size and channel
image_size=32
image_channel=1
# Adam lr and betas
learning_rate = 0.0002
betas = (0.5, 0.999)
# epochs and batch size
n_epochs = 100
batch_size = 32
# device : cpu or cuda:0/1/2/3
device = torch.device('cuda:2')
# mnist dataset and dataloader
train_dataset = load_mnist_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# use L2Loss as loss function
l2loss = L2Loss().to(device)
# G and D model, use DCGAN
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
In [ ]:
d_loss_hist, g_loss_hist = run_gan(trainloader, G, D, G_optimizer, D_optimizer, l2loss, n_epochs, device,
latent_dim)
loss_plot(d_loss_hist, g_loss_hist)
GANs still suffer from training instability and mode collapse (where the diversity of the generated images is extremely low); our datasets may not necessarily expose these problems. WGAN (Wasserstein GAN) replaces the JS divergence fitted by the vanilla GAN with the Wasserstein distance, which alleviates the instability and mode-collapse issues to some extent.
The optimization objective of the WGAN discriminator (critic) becomes: subject to a Lipschitz constraint (which we can satisfy by restricting the weights w to a fixed range), maximize
$$\mathbb{E}_{x\sim p_{data}}[D(x)] - \mathbb{E}_{z\sim p(z)}[D(G(z))].$$
Concretely, the WGAN implementation differs in three main ways: (1) the sigmoid is removed from the last layer of D, so D outputs an unbounded score instead of a probability; (2) the losses of G and D no longer take a log: D maximizes its mean score on real images minus its mean score on generated images, while G maximizes D's mean score on generated images; (3) after every update of D, its weights are clipped to a fixed range [-c, c] to approximately enforce the Lipschitz constraint.
Accordingly, we mainly rewrite the WGAN training function; for the networks we reuse the DCGAN architecture with the sigmoid removed (note that we pass sigmoid=False when constructing D to drop the final sigmoid layer).
Below is the WGAN implementation. Two new parameters are introduced: n_d, the number of D updates per G update, and weight_clip, the clipping constant.
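As a small illustration (not in the original notebook) of what the weight-clipping step does, the cell below clamps the parameters of a toy linear layer into [-weight_clip, weight_clip]; the layer size and the scaling factor are chosen only to make the effect visible.
In [ ]:
import torch
import torch.nn as nn

# a toy layer whose weights we deliberately make large
layer = nn.Linear(4, 4)
with torch.no_grad():
    layer.weight.mul_(10)

# the same clipping used in wgan_train below
weight_clip = 0.01
for params in layer.parameters():
    params.data.clamp_(-weight_clip, weight_clip)

print(layer.weight.min().item(), layer.weight.max().item())  # both lie in [-0.01, 0.01]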
In [ ]:
def wgan_train(trainloader, G, D, G_optimizer, D_optimizer, device, z_dim, n_d=2, weight_clip=0.01):
"""
n_d: the number of iterations of D update per G update iteration
weight_clip: the weight clipping constant
"""
D.train()
G.train()
D_total_loss = 0
G_total_loss = 0
for i, (x, _) in enumerate(trainloader):
x = x.to(device)
# update D network
# D optimizer zero grads
D_optimizer.zero_grad()
# D real loss from real images
d_real = D(x)
d_real_loss = - d_real.mean()
# D fake loss from fake images generated by G
z = torch.rand(x.size(0), z_dim).to(device)
g_z = G(z)
d_fake = D(g_z)
d_fake_loss = d_fake.mean()
# D backward and step
d_loss = d_real_loss + d_fake_loss
d_loss.backward()
D_optimizer.step()
# D weight clip
for params in D.parameters():
params.data.clamp_(-weight_clip, weight_clip)
D_total_loss += d_loss.item()
# update G network
if (i + 1) % n_d == 0:
# G optimizer zero grads
G_optimizer.zero_grad()
# G loss
g_z = G(z)
d_fake = D(g_z)
g_loss = - d_fake.mean()
# G backward and step
g_loss.backward()
G_optimizer.step()
G_total_loss += g_loss.item()
return D_total_loss / len(trainloader), G_total_loss * n_d / len(trainloader)
In [ ]:
def run_wgan(trainloader, G, D, G_optimizer, D_optimizer, n_epochs, device, latent_dim, n_d, weight_clip):
d_loss_hist = []
g_loss_hist = []
for epoch in range(n_epochs):
d_loss, g_loss = wgan_train(trainloader, G, D, G_optimizer, D_optimizer, device,
z_dim=latent_dim, n_d=n_d, weight_clip=weight_clip)
print('Epoch {}: Train D loss: {:.4f}, G loss: {:.4f}'.format(epoch, d_loss, g_loss))
d_loss_hist.append(d_loss)
g_loss_hist.append(g_loss)
if epoch == 0 or (epoch + 1) % 10 == 0:
visualize_results(G, device, latent_dim)
return d_loss_hist, g_loss_hist
Next, let's use run_wgan to train on our furniture (chair) dataset and see how it does.
In [ ]:
# hyper params
# z dim
latent_dim = 100
# image size and channel
image_size=32
image_channel=3
# Adam lr and betas
learning_rate = 0.0002
betas = (0.5, 0.999)
# epochs and batch size
n_epochs = 300
batch_size = 32
# n_d: the number of iterations of D update per G update iteration
n_d = 2
weight_clip=0.01
# device : cpu or cuda:0/1/2/3
device = torch.device('cuda:2')
# furniture dataset and dataloader
train_dataset = load_furniture_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
In [ ]:
# G and D model, use DCGAN, note that sigmoid is removed in D
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel, sigmoid=False).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
d_loss_hist, g_loss_hist = run_wgan(trainloader, G, D, G_optimizer, D_optimizer, n_epochs, device,
latent_dim, n_d, weight_clip)
From the theory of WGAN we know that the negative of D_loss approximates the Wasserstein distance between the generated distribution and the real distribution: the smaller it is, the more similar the two distributions, and the better the GAN is trained. Its value therefore gives us a useful metric while training the GAN.
Run the cell below to look at WGAN's loss curves. Overall, the negative of D_loss decreases as the number of epochs grows, and the generated samples get closer and closer to the real data, which is consistent with the theory behind WGAN.
In [ ]:
loss_plot(d_loss_hist, g_loss_hist)
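In addition to loss_plot, here is a small optional sketch (not in the original notebook): since -D_loss estimates the Wasserstein distance, we can plot it directly; this assumes d_loss_hist holds the per-epoch critic losses returned by run_wgan above.
In [ ]:
import matplotlib.pyplot as plt

# plot the estimated Wasserstein distance (-D_loss) per epoch
w_estimate = [-d for d in d_loss_hist]
plt.plot(w_estimate)
plt.xlabel('epoch')
plt.ylabel('estimated Wasserstein distance (-D_loss)')
plt.show()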
Next, run the two cells below to visualize the distribution of the parameters of the WGAN discriminator D.
In [ ]:
from utils import show_weights_hist
def show_d_params(D):
plist = []
for params in D.parameters():
plist.extend(params.cpu().data.view(-1).numpy())
show_weights_hist(plist)
In [ ]:
show_d_params(D)
We can see that the parameters are all clipped into [-c, c], with most of them concentrated near -c and c.
Answer:
The results are best when n_d = 1. Although n_d is the number of D updates per G update, D is trained first in the loop, i.e. G is only updated once every n_d batches. With n_d = 1, G gets the largest number of updates.
In [ ]:
n_d = 1
# G and D model, use DCGAN, note that sigmoid is removed in D
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel, sigmoid=False).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
d_loss_hist, g_loss_hist = run_wgan(trainloader, G, D, G_optimizer, D_optimizer, n_epochs, device,
latent_dim, n_d, weight_clip)
loss_plot(d_loss_hist, g_loss_hist)
In [ ]:
n_d = 3
# G and D model, use DCGAN, note that sigmoid is removed in D
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel, sigmoid=False).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
d_loss_hist, g_loss_hist = run_wgan(trainloader, G, D, G_optimizer, D_optimizer, n_epochs, device,
latent_dim, n_d, weight_clip)
loss_plot(d_loss_hist, g_loss_hist)
In [ ]:
n_d = 5
# G and D model, use DCGAN, note that sigmoid is removed in D
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel, sigmoid=False).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
d_loss_hist, g_loss_hist = run_wgan(trainloader, G, D, G_optimizer, D_optimizer, n_epochs, device,
latent_dim, n_d, weight_clip)
loss_plot(d_loss_hist, g_loss_hist)
In WGAN the weights have to be clipped, and in experiments we find that a relatively deep WGAN does not converge easily.
Roughly speaking, the reasons are as follows: weight clipping drives most of D's parameters to the two clipping boundaries -c and c (as observed above), which wastes the capacity of the network and makes the critic fit overly simple functions; in addition, depending on the clipping constant, the gradients flowing through the clipped layers tend to either vanish or explode.
WGAN-GP therefore replaces clipping with a gradient penalty.
Since the Lipschitz constraint requires the norm of the discriminator's gradient to be bounded by K, we can enforce it directly with an extra loss term, and the improved objective for D becomes
$$L_D = \mathbb{E}_{\tilde{x}\sim p_g}[D(\tilde{x})] - \mathbb{E}_{x\sim p_{data}}[D(x)] + \lambda\,\mathbb{E}_{\hat{x}}\big[(\|\nabla_{\hat{x}} D(\hat{x})\|_2 - 1)^2\big],$$
where $\hat{x}$ is sampled uniformly along the line between a real sample and a generated sample (the epsilon interpolation in the code below).
Below is the implementation of WGAN-GP. As with WGAN, we only implement its training function and directly reuse the DCGAN models.
In [ ]:
import torch.autograd as autograd
def wgan_gp_train(trainloader, G, D, G_optimizer, D_optimizer, device, z_dim, lambda_=10, n_d=2):
D.train()
G.train()
D_total_loss = 0
G_total_loss = 0
for i, (x, _) in enumerate(trainloader):
x = x.to(device)
# update D network
# D optimizer zero grads
D_optimizer.zero_grad()
# D real loss from real images
d_real = D(x)
d_real_loss = - d_real.mean()
# D fake loss from fake images generated by G
z = torch.rand(x.size(0), z_dim).to(device)
g_z = G(z)
d_fake = D(g_z)
d_fake_loss = d_fake.mean()
# D gradient penalty
# a random number epsilon
epsilon = torch.rand(x.size(0), 1, 1, 1).cuda()
x_hat = epsilon * x + (1 - epsilon) * g_z
x_hat.requires_grad_(True)
y_hat = D(x_hat)
# computes the sum of gradients of y_hat with regard to x_hat
gradients = autograd.grad(outputs=y_hat, inputs=x_hat, grad_outputs=torch.ones(y_hat.size()).cuda(),
create_graph=True, retain_graph=True, only_inputs=True)[0]
# compute the gradient penalty
gradient_penalty = torch.mean((gradients.view(gradients.size()[0], -1).norm(p=2, dim=1) - 1) ** 2)
# D backward and step
d_loss = d_real_loss + d_fake_loss + lambda_ * gradient_penalty
d_loss.backward()
D_optimizer.step()
D_total_loss += d_loss.item()
# update G network
# G optimizer zero grads
if (i + 1) % n_d == 0:
G_optimizer.zero_grad()
# G loss
g_z = G(z)
d_fake = D(g_z)
g_loss = - d_fake.mean()
# G backward and step
g_loss.backward()
G_optimizer.step()
G_total_loss += g_loss.item()
return D_total_loss / len(trainloader), G_total_loss * n_d / len(trainloader)
In [ ]:
# hyper params
# z dim
latent_dim = 100
# image size and channel
image_size=32
image_channel=3
# Adam lr and betas
learning_rate = 0.0002
betas = (0.5, 0.999)
# epochs and batch size
n_epochs = 300
batch_size = 32
# device : cpu or cuda:0/1/2/3
device = torch.device('cuda:2')
# n_d: the number of D updates per G update
n_d = 2
lambda_ = 10
# furniture dataset and dataloader
train_dataset = load_furniture_data()
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# G and D model, use DCGAN, note that sigmoid is removed in D
G = DCGenerator(image_size=image_size, latent_dim=latent_dim, output_channel=image_channel).to(device)
D = DCDiscriminator(image_size=image_size, input_channel=image_channel, sigmoid=False).to(device)
# G and D optimizer, use Adam or SGD
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate, betas=betas)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate, betas=betas)
d_loss_hist = []
g_loss_hist = []
for epoch in range(n_epochs):
d_loss, g_loss = wgan_gp_train(trainloader, G, D, G_optimizer, D_optimizer, device,
z_dim=latent_dim, lambda_=lambda_, n_d=n_d)
print('Epoch {}: Train D loss: {:.4f}, G loss: {:.4f}'.format(epoch, d_loss, g_loss))
d_loss_hist.append(d_loss)
g_loss_hist.append(g_loss)
if epoch == 0 or (epoch + 1) % 10 == 0:
visualize_results(G, device, latent_dim)
As before, inspect the loss curves and the distribution of D's parameters.
In [ ]:
loss_plot(d_loss_hist, g_loss_hist)
In [ ]:
show_d_params(D)
Answer:
We can observe that, at the same epoch, WGAN-GP generates better-looking images; compared with WGAN, WGAN-GP converges faster. Right at the start of training, WGAN-GP quickly produces a rough outline of the objects, while WGAN is somewhat slower.
In WGAN's loss curves, the negative of D's loss and G's loss both decrease gradually, whereas in WGAN-GP's curves G's loss gradually increases.
In WGAN, D's parameters are mostly concentrated at -c and c, whereas in WGAN-GP they are mostly distributed around 0.